{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征工程on数据集Rent Listing Inqueries "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "import必要的工具包，用于文件读取／特征编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "#用于计算feature字段的文本特征提取\n",
    "from sklearn.feature_extraction.text import  CountVectorizer\n",
    "#from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "#CountVectorizer为稀疏特征，特征编码结果存为稀疏矩阵xgboost处理更高效\n",
    "from scipy import sparse\n",
    "\n",
    "#对类别型特征进行编码\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from MeanEncoder import MeanEncoder\n",
    "\n",
    "#对地理位置通过聚类进行离散化\n",
    "from sklearn.cluster import KMeans\n",
    "from nltk.metrics import distance as distance"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "数据文件路径和文件名"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#input data\n",
    "dpath = './data/'\n",
    "train = pd.read_json(\"data/RentListingInquries_train.json\")\n",
    "test = pd.read_json(\"data/RentListingInquries_test.json\")\n",
    "\n",
    "#train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 标签interest_level"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将类别型的标签interest_level编码为数字\n",
    "从前面的分析和常识来看，listing_id对确定interest_level没有用，去掉"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_map = {'low': 2, 'medium': 1, 'high': 0}\n",
    "train['interest_level'] = train['interest_level'].apply(lambda x: y_map[x])\n",
    "\n",
    "y_train = train['interest_level']\n",
    "train.drop(['listing_id', 'interest_level'], axis=1,inplace = True)\n",
    "\n",
    "test.drop(['listing_id'], axis=1,inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## price, bathrooms, bedrooms\n",
    "数值型特征，+／-／*／ ／\n",
    "特征的单调变换对XGBoost不必要"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def remove_noise(df):\n",
    "#remove some noise\n",
    "    df= df[df.price < 10000]\n",
    "\n",
    "    df.loc[df[\"bathrooms\"] == 112, \"bathrooms\"] = 1.5\n",
    "    df.loc[df[\"bathrooms\"] == 10, \"bathrooms\"] = 1\n",
    "    df.loc[df[\"bathrooms\"] == 20, \"bathrooms\"] = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#构造新特征\n",
    "#price_bathrooms：单位bathroom的价格\n",
    "#price_bedrooms：单位bedroom的价格\n",
    "def create_price_room(df):\n",
    "    df['price_bathrooms'] =  (df[\"price\"])/ (df[\"bathrooms\"] +1.0)\n",
    "    df['price_bedrooms'] =  (df[\"price\"])/ (df[\"bedrooms\"] +1.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#构造新特征\n",
    "#room_diff：bathroom房间数 - bedroom房间数\n",
    "#room_num：bathroom房间数 + bedroom房间数\n",
    "def create_room_diff_sum(df):\n",
    "    df[\"room_diff\"] = df[\"bathrooms\"] - df[\"bedrooms\"]\n",
    "    df[\"room_num\"] = df[\"bedrooms\"] + df[\"bathrooms\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 创建日期created"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_created_date(df):\n",
    "    df['Date'] = pd.to_datetime(df['created'])\n",
    "    df['Year'] = df['Date'].dt.year\n",
    "    df['Month'] = df['Date'].dt.month\n",
    "    df['Day'] = df['Date'].dt.day\n",
    "    df['Wday'] = df['Date'].dt.dayofweek\n",
    "    df['Yday'] = df['Date'].dt.dayofyear\n",
    "    df['hour'] = df['Date'].dt.hour\n",
    "\n",
    "    df.drop(['Date', 'created'], axis=1,inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## description"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#简单丢弃，也可以参照fature特征处理方式\n",
    "def procdess_description(df):\n",
    "    df.drop(['description'], axis=1,inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## manager_id\n",
    "将manager分为几个等级\n",
    "top 1%， 2%， 5， 10， 15， 20， 25， 30， 50，"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_manager_id(df):\n",
    "    managers_count = df['manager_id'].value_counts()\n",
    "\n",
    "    df['top_10_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[\n",
    "        managers_count.values >= np.percentile(managers_count.values, 90)] else 0)\n",
    "    df['top_25_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[\n",
    "        managers_count.values >= np.percentile(managers_count.values, 75)] else 0)\n",
    "    df['top_5_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[\n",
    "        managers_count.values >= np.percentile(managers_count.values, 95)] else 0)\n",
    "    df['top_50_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[\n",
    "        managers_count.values >= np.percentile(managers_count.values, 50)] else 0)\n",
    "    df['top_1_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[\n",
    "        managers_count.values >= np.percentile(managers_count.values, 99)] else 0)\n",
    "    df['top_2_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[\n",
    "        managers_count.values >= np.percentile(managers_count.values, 98)] else 0)\n",
    "    df['top_15_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[\n",
    "        managers_count.values >= np.percentile(managers_count.values, 85)] else 0)\n",
    "    df['top_20_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[\n",
    "        managers_count.values >= np.percentile(managers_count.values, 80)] else 0)\n",
    "    df['top_30_manager'] = df['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[\n",
    "        managers_count.values >= np.percentile(managers_count.values, 70)] else 0)\n",
    "    \n",
    "    df.drop(['manager_id'], axis=1,inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## building_id\n",
    "类似manager_id处理\n",
    "直接删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_building_id(df):\n",
    "    df.drop(['building_id'], axis=1,inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## photos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_photos(df):\n",
    "    #df['photos_count'] = df['photos'].apply(lambda x: len(x))\n",
    "    df.drop(['photos'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## latitude, longtitude\n",
    "聚类降维编码(#用训练数据训练，对训练数据和测试数据都做变换)\n",
    "到中心的距离（论坛上讨论到曼哈顿中心的距离更好）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_location_train(df):   \n",
    "    train_location = df.loc[:,[ 'latitude', 'longitude']]\n",
    "    \n",
    "     # Clustering\n",
    "    kmeans_cluster = KMeans(n_clusters=20)\n",
    "    res = kmeans_cluster.fit(train_location)\n",
    "    res = kmeans_cluster.predict(train_location)\n",
    "\n",
    "    df['cenroid'] = res\n",
    "\n",
    "    # L1 distance\n",
    "    center = [ train_location['latitude'].mean(), train_location['longitude'].mean()]\n",
    "    df['distance'] = abs(df['latitude'] - center[0]) + abs(df['longitude'] - center[1])\n",
    "    \n",
    "    #原始特征也可以考虑保留，此处简单丢弃\n",
    "    df.drop(['latitude', 'longitude'], axis=1, inplace=True)\n",
    "    \n",
    "    return kmeans_cluster,center"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_location_test(df, kmeans_cluster, center):   \n",
    "    test_location = df.loc[:,[ 'latitude', 'longitude']]\n",
    "    \n",
    "     # Clustering\n",
    "    res = kmeans_cluster.predict(test_location)\n",
    "\n",
    "    df['cenroid'] = res\n",
    "\n",
    "    # L1 distance\n",
    "    df['distance'] = abs(df['latitude'] - center[0]) + abs(df['longitude'] - center[1])\n",
    "    df.drop(['latitude', 'longitude'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## display_address\n",
    "定义高基数类别型特征编码函数\n",
    "对这些特征进行均值编码（该特征值在每个类别的概率，即原来的一维特征变成了C-1维特征，C为标签类别数目）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_display_address_train_test(df_train, y_train, df_test):\n",
    "    n_train_samples = len(df_train.index)    \n",
    "    df_train_test = pd.concat((df_train, df_test), axis=0)\n",
    "\n",
    "    lb = LabelEncoder()\n",
    "    lb.fit(list(df_train_test['display_address'].values))\n",
    "    df_train_test ['display_address'] = lb.transform(list(df_train_test['display_address'].values))\n",
    "    \n",
    "    #import pdb\n",
    "    #pdb.set_trace()\n",
    "    me = MeanEncoder(['display_address'], target_type='classification')\n",
    "    df_train_test = me.fit_transform(df_train_test, y_train)\n",
    "\n",
    "    df_train_test.drop(['display_address'], axis=1,inplace = True)\n",
    "    \n",
    "    df_train = df_train_test.iloc[:n_train_samples, :]\n",
    "    df_test = df_train_test.iloc[n_train_samples:, :]\n",
    "    \n",
    "    return df_train, df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_display_address_test(df, lb, me):\n",
    "    #要警惕test中出现train中没有的特征取值，最好train和test一起处理\n",
    "    df['display_address'] = lb.transform(list(df['display_address'].values))\n",
    "    df = me.transform(df)\n",
    "\n",
    "    df.drop(['display_address'], axis=1,inplace = True)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## street_address"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 和display_address信息冗余，去掉\n",
    "def procdess_street_address(df):\n",
    "    df = df.drop(['street_address'], axis=1,inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## features\n",
    "描述特征文字长度\n",
    "特征中单词的词频，相当于以数据集features中出现的词语为字典的one-hot编码（虽然是词频，但在这个任务中每个单词通常只出现一次）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_features_train_test(df_train, df_test):\n",
    "    n_train_samples = len(df_train.index)\n",
    "    \n",
    "    df_train_test = pd.concat((df_train, df_test), axis=0)\n",
    "    df_train_test['features2'] = df_train_test['features']\n",
    "    df_train_test['features2'] = df_train_test['features2'].apply(lambda x: ' '.join(x))\n",
    "\n",
    "    c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1), decode_error='ignore')\n",
    "    c_vect_sparse = c_vect.fit_transform(df_train_test['features2'])\n",
    "    c_vect_sparse_cols = c_vect.get_feature_names()\n",
    "\n",
    "    df_train.drop(['features'], axis=1, inplace=True)\n",
    "    df_test.drop(['features'], axis=1, inplace=True)\n",
    "    \n",
    "    #hstack作为特征处理的最后一部，先将其他所有特征都转换成数值型特征才能处理,稀疏表示\n",
    "    df_train_sparse = sparse.hstack([df_train, c_vect_sparse[:n_train_samples,:]]).tocsr()\n",
    "    df_test_sparse = sparse.hstack([df_test, c_vect_sparse[n_train_samples:,:]]).tocsr()\n",
    "    \n",
    "    #常规datafrmae\n",
    "    tmp = pd.DataFrame(c_vect_sparse.toarray()[:n_train_samples,:],columns = c_vect_sparse_cols, index=df_train.index)\n",
    "    df_train = pd.concat([df_train, tmp], axis=1)\n",
    "    \n",
    "    tmp = pd.DataFrame(c_vect_sparse.toarray()[n_train_samples:,:],columns = c_vect_sparse_cols, index=df_test.index)\n",
    "    df_test = pd.concat([df_test, tmp], axis=1)\n",
    "    \n",
    "    #df_test = pd.concat([df_test, tmp[n_train_samples:,:]], axis=1)\n",
    "  \n",
    "    return df_train_sparse,df_test_sparse,df_train, df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def procdess_features_test(df, c_vect):\n",
    "    df['features2'] = df['features']\n",
    "    df['features2'] = df['features2'].apply(lambda x: ' '.join(x))\n",
    "\n",
    "    c_vect_sparse = c_vect.transform(df['features2'])\n",
    "    c_vect_sparse_cols = c_vect.get_feature_names()\n",
    "\n",
    "    df.drop(['features', 'features2'], axis=1, inplace=True)\n",
    "    \n",
    "    #hstack作为特征处理的最后一部，先将其他所有特征都转换成数值型特征才能处理\n",
    "    df_sparse = sparse.hstack([df, c_vect_sparse]).tocsr()\n",
    "    \n",
    "    tmp = pd.DataFrame(c_vect_sparse.toarray(),columns = c_vect_sparse_cols, index=df.index)\n",
    "    df = pd.concat([df, tmp], axis=1)\n",
    "    \n",
    "    return df_sparse, df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 对训练样本做特征工程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/qing/anaconda2/lib/python2.7/site-packages/pandas/core/indexing.py:517: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  self.obj[item] = s\n"
     ]
    }
   ],
   "source": [
    "remove_noise(train)\n",
    "\n",
    "create_price_room(train)\n",
    "create_room_diff_sum(train)\n",
    "\n",
    "procdess_created_date(train)\n",
    "\n",
    "procdess_description(train)\n",
    "\n",
    "procdess_manager_id(train)\n",
    "\n",
    "procdess_building_id(train)\n",
    "procdess_photos(train)\n",
    "\n",
    "kmeans_cluster,center = procdess_location_train(train)\n",
    "procdess_street_address(train)\n",
    "\n",
    "#测试集中可能出现新的特征值，所以训练和测试集一起做\n",
    "#lb, me, train = procdess_display_address_train(train, y_train)\n",
    "#X_train_sparse,X_test_sparse,train,test = procdess_features_train_test(train,test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 对测试样本做特征工程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "remove_noise(test)\n",
    "\n",
    "create_price_room(test)\n",
    "create_room_diff_sum(test)\n",
    "\n",
    "procdess_created_date(test)\n",
    "\n",
    "procdess_description(test)\n",
    "\n",
    "procdess_manager_id(test)\n",
    "\n",
    "procdess_building_id(test)\n",
    "procdess_photos(test)\n",
    "\n",
    "procdess_location_test(test, kmeans_cluster, center)\n",
    "\n",
    "procdess_street_address(test)\n",
    "\n",
    "#测试数据出现了训练数据中没有出现的词语，报错，可以训练数据和测试数据一起训练CountVectorizer\n",
    "#test = procdess_display_address_test(test, lb, me )\n",
    "#X_test_sparse,test = procdess_features_test(test, c_vect)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "MeanEncoder.py:54: FutureWarning: using a dict on a Series for aggregation\n",
      "is deprecated and will be removed in a future version\n",
      "  col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})\n"
     ]
    }
   ],
   "source": [
    "train,test = procdess_display_address_train_test(train, y_train,test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train_sparse,X_test_sparse,train,test = procdess_features_train_test(train,test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征处理结果存为文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#存为csv格式方便用excel查看(属性名字有重复，features得到的词语中也有bathrooms和bedrooms)\n",
    "train = pd.concat([train, y_train], axis=1)\n",
    "train.to_csv(dpath + 'RentListingInquries_FE_train.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test.to_csv(dpath + 'RentListingInquries_FE_test.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "#from  scipy.io import mmwrite\n",
    "#train_sparse = sparse.hstack([X_train_sparse, sparse.csr_matrix(y_train).T]).tocsr()\n",
    "\n",
    "#mmwrite(dpath + 'RentListingInquries_FE_train.txt',train_sparse)\n",
    "#mmwrite(dpath + 'RentListingInquries_FE_test.txt',X_test_sparse)\n",
    "\n",
    "#存为libsvm稀疏格式，直接调用XGBoost的话用稀疏格式更高效\n",
    "#from sklearn.datasets import dump_svmlight_file\n",
    "#dump_svmlight_file(X_train_sparse, y_train, dpath + 'RentListingInquries_FE_train.txt', zero_based=False) \n",
    "\n",
    "import xgboost as xgb\n",
    "\n",
    "dtrain = xgb.DMatrix(X_train_sparse, label = y_train)\n",
    "dtest = xgb.DMatrix(X_test_sparse)\n",
    "\n",
    "dtrain.save_binary(dpath + 'RentListingInquries_FE_train.bin')\n",
    "dtest.save_binary(dpath + 'RentListingInquries_FE_test.bin')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
